
************************************************************************************************
************************************************************************************************
*This .do file estimates the structural model and creates Table 1 and Figure 1

*Input:
*1. "student_mb_2009to2017.dta" (created by "Data_get_2009_17.do")

*Results created by .do file (in order):
*1. Table 1 Column (2) (without standard errors)
*2. Figure 1(b)
************************************************************************************************
************************************************************************************************

clear all
use "/data_analysis/NC_RD_Retake/student_mb_2009to2017.dta"
set more off
set seed 55555

*No need for post-2012 data
drop if year>2012
*No grade 8 
drop if grade==8
*Drop extend tests (disabled tests)
replace smathscal=. if nc_extend_math==1
replace sreadscal=. if nc_extend_read==1
drop nc_extend_math nc_extend_math_retake nc_extend_read nc_extend_read_retake
*Need current or subsequent scores in math
drop if sreadscal==.
drop if Fsreadscal==.

*Create a standardized retest gain (for each grade-year combination)
gen read_retake_stan=.
gen read_stan=.
gen retest_gain_stan=.
foreach g of numlist 3(1)7{
foreach t of numlist 2009(1)2012{
su read if grade==`g' & year==`t'
replace read_retake_stan=(read_retake-r(mean))/r(sd) if grade==`g' & year==`t'
replace read_stan=(read-r(mean))/r(sd) if grade==`g' & year==`t'
replace retest_gain_stan=read_retake_stan-read_stan if grade==`g' & year==`t'
}
}
*Save so do not need to rerun above every time in loop 
compress
save "/data_analysis/NC_RD_Retake/very_temp.dta", replace



***Now run the structural model to find the parameters***
clear all
set seed 55555
use "/data_analysis/NC_RD_Retake/very_temp.dta"
*Drop uneeded vars
drop mastid year grade lea schlcode schoolid retested_m math_retake read retested_r read_retake sex lep swd eds aig_math aig_read repeating repeat_next_year ethnic charter Fsmathscal F2smathscal F3smathscal F4smathscal lag_sreadscal sreadscal Fsreadscal F2sreadscal F3sreadscal F4sreadscal math_reg_date math_retest_date read_reg_date read_retest_date

*Create a 'grid' to discretize the simulated score in 0.1\sigma increments
*Max and min of grade
su read_stan if running_read==0.5
local max=r(max)
su read_stan if running_read==-10.5
local min=r(min)

*Create 0.1\sigma bins; these are used as our discretized "bins" in the structural model
foreach b of numlist 1(1)14{
local bin`b'_min=(-0.3-`b'*0.1)-0.05
local bin`b'_max=(-0.3-`b'*0.1)+0.05
local bin`b'=(-0.3-`b'*0.1)
}
display `bin1_min', `bin1_max', `bin1'
display `bin14_min', `bin14_max', `bin14'

*Drop if missing retest and in retest range
drop if retest_gain_stan==. & read_stan>`bin14_min' & read_stan<`bin1_max'

*All Grade average reliability: 0.9118
display 1*(1-0.9118)^0.5

local sd=(0.9118)^0.5
set obs 2500000
gen ability=rnormal(0,`sd')
su ability
display r(sd)^2
local sim_var=r(sd)^2

*Make var exactly 0.9118
replace ability=((0.9118/`sim_var')^0.5)*ability
su ability
display r(sd)^2

gen abs_ability=abs(ability)

*Drop anything unecessary for loop
drop math running_math lag_smathscal smathscal read_retake_stan
qui compress

************************************************************************************************************************************************
************************************************************************************************************************************************
*****Did big search, now narrrowing search to theta_0 between 0.15 and 0.25, theta_1 between and 0.03 and 0.20, gain between 0.10 and 0.15*****
************************************************************************************************************************************************
************************************************************************************************************************************************

*Now loop through the theta_0s to find optimal
foreach t of numlist 0.25(0.001)0.29{
preserve
display "Theta 0 is `t'"
*Parameters
local theta_0=`t'

qui gen sd=.
qui gen min=.
*Here are the theta_1 guesses
foreach n of numlist 0.001(0.01)0.065{
local obs=`n'*10000
local theta_1=`n'
qui gen var=(`theta_0' + `theta_1'*(abs_ability))^2
qui gen sem=(var)^0.5
qui gen test_score_sim=ability+rnormal(0,sem)
qui su test_score_sim
qui replace sd=abs(r(sd)-1) if _n==`obs'
qui replace min=`n' if _n==`obs'
qui drop var sem test_score_sim
}
qui egen min_sd=min(sd)
qui su min if sd==min_sd

*Generate theta 1
local theta_1=r(mean)
qui drop min_sd sd min

qui gen var=(`theta_0' + `theta_1'*(abs_ability))^2
qui gen sem=(var)^0.5
qui gen test_score_sim=ability+rnormal(0,sem)
qui su test_score_sim
local temp=r(sd)
display "Ensure test score is normal: `temp'"

qui gen sim_retest_gain=ability-test_score_sim

*Discretize simulated test score
foreach n of numlist 1(1)14{
qui replace test_score_sim=`bin`n'' if test_score_sim>`bin`n'_min' & test_score_sim<`bin`n'_max'
}

*Cutoff
qui gen real_ind=(read_stan>`bin14_min' & read_stan<`bin1_max')
qui gen sim_ind=(test_score_sim>`bin14_min' & test_score_sim<`bin1_max')

qui keep if real_ind==1 | sim_ind==1

qui expand 2, gen(g)
qui drop if real_ind==0 & g==0
qui drop if real_ind==1 & g==1

qui replace retest_gain_stan=. if g==1
qui replace sim_retest_gain=. if g==0

*drop missing values (only occurs in the real data)
qui drop if retest_gain_stan==. & g==0

qui replace sim_ind=g

qui gen r_gain=retest_gain_stan if g==0
qui replace r_gain=sim_retest_gain if g==1

qui gen test=read_stan if g==0
qui replace test=test_score_sim if g==1

*Discretize real test score
foreach n of numlist 1(1)14{
qui replace test=`bin`n'' if test>`bin`n'_min' & test<`bin`n'_max' & g==0
}

qui gen number=1 if g==0
qui collapse (sum) number (mean) retest_gain_stan sim_retest_gain, by(test)

*Find test gain that minimizes MSE
qui set obs 350
qui gen min=.
qui gen gain=.
foreach gain of numlist 0.04(0.001)0.09{
local n=`gain'*1000
qui gen SST=(retest_gain_stan-sim_retest_gain-`gain')^2
qui su SST
qui replace min=r(mean) if _n==`n'
qui replace gain=`gain' if _n==`n'
qui drop SST
}
qui egen min_min=min(min)
qui su gain if min==min_min
qui local gain=r(mean)
qui su min if min==min_min
qui local min=r(mean)

display "Gain is `gain' and MSE is `min' and (theta0, theta1) is (`t',`theta_1')"
qui gen theta0=`t'
qui gen theta1=`theta_1'
qui gen struc_gain=`gain'
qui gen Error=`min'
qui keep if _n==1
local z=`t'*10000
qui compress
qui save "/data_analysis/NC_RD_Retake/extreme_temp_`z'.dta", replace
restore
}

clear all
foreach t of numlist 0.25(0.001)0.29{
local z=`t'*10000
append using "/data_analysis/NC_RD_Retake/extreme_temp_`z'.dta"
erase "/data_analysis/NC_RD_Retake/extreme_temp_`z'.dta"
}

*These are the parameters; they are reported in Table 1 and are now used to create Figure 1
egen min_Error=min(Error)
browse if Error==min_Error




*********************************************************************************************
*********************************************************************************************
*********Now with optimal choices, graph the resulting real and predicted data***************
*********************************************************************************************
*********************************************************************************************
clear all
set seed 55555
use "/data_analysis/NC_RD_Retake/very_temp.dta"
*Drop uneeded vars
drop mastid year grade lea schlcode schoolid retested_m math_retake read retested_r read_retake sex lep swd eds aig_math aig_read repeating repeat_next_year ethnic charter Fsmathscal F2smathscal F3smathscal F4smathscal lag_sreadscal sreadscal Fsreadscal F2sreadscal F3sreadscal F4sreadscal math_reg_date math_retest_date read_reg_date read_retest_date

*******************************
*******************************
*Parameters
local theta_0=0.257
local theta_1=0.041
local alpha=0.083
*MSE==0.000099
*******************************
*******************************

*Create a 'grid' to discretize the simulated score in 0.1\sigma increments
*Max and min of grade
su read_stan if running_read==0.5
local max=r(max)
su read_stan if running_read==-10.5
local min=r(min)

foreach b of numlist 1(1)14{
local bin`b'_min=(-0.3-`b'*0.1)-0.05
local bin`b'_max=(-0.3-`b'*0.1)+0.05
local bin`b'=(-0.3-`b'*0.1)
}
display `bin1_min', `bin1_max', `bin1'
display `bin14_min', `bin14_max', `bin14'

*Drop if missing retest and in retest range
drop if retest_gain_stan==. & read_stan>`bin14_min' & read_stan<`bin1_max'

*All Grade average reliability: 0.9118
display 1*(1-0.9118)^0.5

local sd=(0.9118)^0.5
set obs 2500000
gen ability=rnormal(0,`sd')
su ability
display r(sd)^2
local sim_var=r(sd)^2

*Make var exactly 0.9118
replace ability=((0.9118/`sim_var')^0.5)*ability
su ability
display r(sd)^2

gen abs_ability=abs(ability)

*qui gen var=(`theta_0')^2 + 2*`theta_0'*`theta_1'*abs_ability + (`theta_1')^2*(abs_ability)^2

qui gen var=(`theta_0' + `theta_1'*(abs_ability))^2
qui gen sem=(var)^0.5
qui gen noise=rnormal(0,sem)
qui gen test_score_sim=ability+rnormal(0,sem)

su noise if ability<-0.5 & ability>-1

su test_score_sim
local temp=r(sd)
display "Ensure test score is normal: `temp'"

qui gen sim_retest_gain=ability-test_score_sim

*Discretize simulated test score
foreach n of numlist 1(1)14{
qui replace test_score_sim=`bin`n'' if test_score_sim>`bin`n'_min' & test_score_sim<`bin`n'_max'
}

*Cutoff
qui gen real_ind=(read_stan>`bin14_min' & read_stan<`bin1_max')
qui gen sim_ind=(test_score_sim>`bin14_min' & test_score_sim<`bin1_max')

qui keep if real_ind==1 | sim_ind==1

qui expand 2, gen(g)
qui drop if real_ind==0 & g==0
qui drop if real_ind==1 & g==1

qui replace retest_gain_stan=. if g==1
qui replace sim_retest_gain=. if g==0

*drop missing values (only occurs in the real data)
qui drop if retest_gain_stan==. & g==0

qui replace sim_ind=g

qui gen r_gain=retest_gain_stan if g==0
qui replace r_gain=sim_retest_gain if g==1

qui gen test=read_stan if g==0
qui replace test=test_score_sim if g==1

*Discretize real test score
foreach n of numlist 1(1)14{
qui replace test=`bin`n'' if test>`bin`n'_min' & test<`bin`n'_max' & g==0
}

*tab test if g==0, su(r_gain) 
*tab test if g==1, su(r_gain) 

qui gen number=1 if g==0
qui collapse (sum) number (mean) retest_gain_stan sim_retest_gain, by(test)

replace sim_retest_gain=sim_retest_gain+`alpha'

*Renam the test bins back into scaled scores
gsort - test 

*Shift it back toward the retest threshold (average of 0.36; call it 0.35)
replace test=test+0.35

****Figure 1(b)*****
twoway (connected retest_gain_stan test, lpattern(dash) lcolor(dkgreen) mcolor(dkgreen) msymbol(square)) (connected sim_retest_gain test, lpattern(dash) msymbol(triangle)), xlabel(0(.2)-1.4) xtitle("Distance from Retest Threshold (English ({&sigma}))") ylabel(0(0.15)0.45) ytitle("Test Score Gain in English Retest ({&sigma})") legend (order(1 "Actual Retest Gain" 2 "Simulated Retest Gain")) graphregion(color(white)) bgcolor(white)





